package com.kdtech.analyse.news;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.HtmlCleaner;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.crawler.CrawlHTML;
import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;


//无效网址
/**
 * {@link AnalyseNews} implementation for article pages under
 * {@code http://www.dtnews.cn/}.
 *
 * <p>It decides whether a URL is an article detail page, extracts the title and
 * body from the page HTML, and looks up author and date through a second request
 * whose id is embedded in the page.
 */
public class DtnewsNewsAnalyse implements AnalyseNews{

	/**
	 * URL patterns that identify an article detail page. The dots in the host
	 * name and in the {@code .html} suffix are escaped so they only match a
	 * literal dot; patterns are compiled once instead of on every call.
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS={
		Pattern.compile("http://www\\.dtnews\\.cn/[0-9]{4}/[a-z]+/[A-Z0-9]+\\.html"),
	};

	/**
	 * Captures the numeric {@code id} found between {@code document.write(unescape}
	 * and {@code &url} in the page source; that id is needed to build the
	 * author/date request.
	 */
	private static final Pattern AUTHOR_ID_PATTERN=
		Pattern.compile("document.write\\(unescape.*id=([0-9]+)&url");

	/**
	 * Tells whether {@code url} is an article detail page of this site.
	 *
	 * @param url the URL to test; may be {@code null}
	 * @return {@code true} when the whole URL matches one of the detail-page
	 *         patterns, {@code false} otherwise (including for {@code null})
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Update hook; this implementation performs no update.
	 *
	 * @param meta the news item offered for update (ignored)
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}

	/**
	 * Builds a {@link NewsMeta} from a fetched article page.
	 *
	 * <p>Title is the text of the {@code title} element up to the first
	 * {@code -}; content is produced by {@code HtmlCleaner.getContentHtml} from
	 * the {@code .news_cont} elements. Author and date stay {@code null} unless
	 * the page embeds an id and the follow-up request returns non-blank HTML.
	 *
	 * @param urlMeta the fetched page; may be {@code null}
	 * @return the populated news item, or {@code null} when {@code urlMeta} is
	 *         {@code null}, carries no HTML, or its URL is not a detail page
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		// Nothing to parse: a missing page or missing HTML would otherwise
		// fail inside Jsoup.parse.
		if (urlMeta == null || urlMeta.getHtml() == null) {
			return null;
		}
		String htmltxt=urlMeta.getHtml();
		String url=urlMeta.getUrl();
		// The selectors below only apply to article pages.
		if(!isDetailPage(url)){
			return null;
		}

		Document doc=Jsoup.parse(htmltxt);
		String title=StringUtils.substringBefore(doc.select("title").text(), "-");
		String content=HtmlCleaner.getContentHtml(url,doc.select(".news_cont"));
		String author=null;
		Long date=null;

		/*
		 * Author and date: not present in the page itself, so they are read from
		 * the response of a second URL built from the embedded id.
		 */
		Matcher matcher = AUTHOR_ID_PATTERN.matcher(htmltxt);
		if(matcher.find()){
			String id = matcher.group(1);
			String dateUrl = "http://w.dtnews.cn/c/shownews/zuozhe.ashx?id="+id+"&url="+url;
			UrlMeta dateMeta = CrawlHTML.responseToURL(dateUrl);
			if(dateMeta != null && StringUtils.isNotBlank(dateMeta.getHtml())){
				String datetxt = dateMeta.getHtml();
				author = StringUtils.substringBetween(datetxt, "来源：", "&nbsp;");
				// NOTE(review): substringBetween yields null when no <span> pair is
				// present - confirm DateUtils.matchDate tolerates a null argument.
				date = DateUtils.matchDate(StringUtils.substringBetween(datetxt, "<span>", "</span>"));
			}
		}

		NewsMeta news=new NewsMeta();
		news.setUrl(url);
		news.setTitle(title);
		news.setContent(content);
		news.setAuthor(author);
		news.setDate(date);
		return news;
	}

	/**
	 * Manual check: fetches one sample article URL through
	 * {@code CrawlHTML.responseToURL}, then prints the detail-page verdict and
	 * the parsed result.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		DtnewsNewsAnalyse test=new DtnewsNewsAnalyse();

		String url="http://www.dtnews.cn/2015/may/F6A20296.html";
		UrlMeta meta=CrawlHTML.responseToURL(url);
		System.out.println(test.isDetailPage(url));
		NewsMeta parserHtml=test.parserHtml(meta);
		System.out.println(parserHtml);
	}

	/**
	 * Reports whether items from this site need a later update pass.
	 *
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate(){
		return false;
	}
}
